library(tidyverse)
library(ggplot2)
library(dplyr)
library(broom)
library(countrycode)
library(tidytext)
library(viridis)
library(rworldmap)
library(ggmap)
library(maps)
library(sp)
library(maptools)
library(readr)
library(ggraph)
library(igraph)
library(RColorBrewer)
library(gganimate)
library(gifski)
library(ggalluvial)
library(hrbrthemes)
library(kableExtra)
library(leaflet)
library (scales)
library (shiny)
library(ggalluvial)
library(grid)
library(gridExtra)
data <- read_csv("data/athlete_events.csv")
dat1 <- read_csv("data/countries.csv")

Medal distribution by country

medal <- data %>%
  filter(!is.na(Medal))

team<- medal %>% 
  group_by(NOC) %>% 
  summarise(Gold =  sum (Medal == "Gold"),
            Silver = sum (Medal == "Silver"),
            Bronze = sum (Medal == "Bronze"))
team_M <- team %>%
  pivot_longer(!"NOC", names_to = "medals", values_to = "number_of_medals")

country_M <- countrycode(team_M$NOC, "ioc", "country.name")

country_M <- data_frame(country_M, team_M)
country_M<- na.omit(country_M)
team_T <- cbind(team, c(rowSums(team [, 2:4], na.rm = TRUE))) 
colnames(team_T)[5] <- "Total"

country_T <- countrycode(team_T$NOC, "ioc", "country.name")

country_T <- data_frame(country_T, team_T)
country_T<- na.omit(country_T)

country_T <- country_T %>%
  arrange(desc(Total)) %>% 
  rename("Country" = country_T)

country_T_10 <- country_T%>% 
  top_n(10)

knitr::kable(country_T_10, caption = "The medals in different country", col.names = c("Country",  "NOC", "Gold", "Silver", "Bronze", "Total"))
The medals in different country
Country NOC Gold Silver Bronze Total
United States USA 2638 1641 1358 5637
Germany GER 745 674 746 2165
United Kingdom GBR 678 739 651 2068
France FRA 501 610 666 1777
Italy ITA 575 531 531 1637
Sweden SWE 479 522 535 1536
Canada CAN 463 438 451 1352
Australia AUS 348 455 517 1320
Russia RUS 390 367 408 1165
Hungary HUN 432 332 371 1135
ct <- inner_join(country_T, dat1, by = "Country")
world <- map_data("world")
visit.x<-ct$Longitude
visit.y<-ct$Latitude
hex_codes <- hue_pal(h=c(180,270)) (length(ct$Country)) 
pal <- colorFactor(hex_codes, domain = ct$Country)

mytext <- paste(
   "Country: ", ct$Country, 
   "Total: ", ct$Total, 
   "Gold: ", ct$Gold,
   "Silver: ", ct$Silver,
   "Bronze: ", ct$Bronze) %>%
  lapply(htmltools::HTML)

map <- leaflet(ct) %>% 
  addTiles() %>%
  addCircles(
    lng = ~Longitude,
    lat = ~Latitude,
    radius = ~Total*300,
    stroke = F,
    fillOpacity = 0.4,
    color = ~pal(Country),
    label = mytext
  ) %>%
  addLegend ("topright", 
             pal=pal, 
             values = ~Country,
             title = "Country",
             opacity = 1
  )

map

The map shows the number of medals won by each country in the world at the Olympic Games, including gold, silver, bronze and total. The size of the circles on the map indicates the number of medals won, so it is easy to see that the USA has the most medals. Europe has the highest number of medals, and the density of the circles shows that most European countries have won medals and have accumulated a significant number of medals in total.

Find out which sport has the largest number of participants and study the distribution of gold MEDALS in different countries over time

sport_count<- data %>% 
  mutate(Number_of_people_in_each_sport = Sport) %>% 
  count(Number_of_people_in_each_sport) %>% 
  top_n(10) 
sport_count %>% 
  mutate(Number_of_people_in_each_sport = fct_reorder(Number_of_people_in_each_sport, n)) %>% 
  ggplot(aes(x = Number_of_people_in_each_sport, 
             y = n, 
             fill = Number_of_people_in_each_sport)) + 
  coord_flip() + 
  geom_text(aes(x = Number_of_people_in_each_sport,  
                y = n + 4000,
                label = n)) + 
  geom_col() + 
  xlab("Sport") +
  ylab("Number of people in each sport") + 
  theme(strip.text = element_text(size = 10), 
        axis.text = element_text(size = 10), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.15),
        axis.title.x = element_text(size = 15), 
        axis.title.y = element_text(size = 15))
Top 10 sports with the most athletes

Top 10 sports with the most athletes

With the Figure @ref(fig:wei1), It illustrate the number of participants in different sports, and select the top 10 of them, according to the figure, the most popular sports is Athletics which is 38,624. the second is gymnastics and the third one is swimming. the fine thing about this figure is that the number of participate athletes of first 3 sport is far more than any other sports.

Athletics_1 <- data %>% 
  filter(Sport == "Athletics" & Medal == "Gold")%>% 
  select(Sport, NOC, Medal, Year) %>% 
  cbind(1) %>% 
  rename("Number" = "1") %>% 
  group_by(NOC, Year) %>%
  summarise(Total = sum(Number)) %>% 
  mutate(Accumu = cumsum(Total)) 

Athletics <- inner_join(Athletics_1, country_T, by="NOC")
ranking <- Athletics %>%
  group_by(Year) %>%
  mutate(rank = rank(-Accumu),
         Accumu_rel = Accumu/Accumu[rank==1],
         Accumu_lbl = paste0(" ",round(Accumu/1e9))) %>%
  group_by(Country) %>% 
  filter(rank <=5) %>%
  ungroup()
ggp <- ggplot(ranking, 
              aes(x = rank, 
                  y = Accumu, 
                  group = Country)) +
  geom_bar(stat = "identity", 
           aes(fill = Country)) +
  transition_states(Year, transition_length = 2, state_length = 0) + 
  ease_aes('quadratic-in-out') +  
  enter_drift(x_mod = -1) + exit_drift(x_mod = 1) +
  labs(x = "Ranking of the gold medal accumulated",
       y = "Accumulated Number of gold medal",
       title = "Year {closest_state}")

animate(ggp, 200, fps = 20,  width = 1200, height = 1000, 
        renderer = gifski_renderer("gganim.gif"))
In the Athletics sports, the ranking of first 5 contries about Total number of Cumulative gold MEDALS won

In the Athletics sports, the ranking of first 5 contries about Total number of Cumulative gold MEDALS won

Then we focus on the athletics sports, the above Figure @ref(fig:wei2) describes only in the Athletics sports, it shows number of Cumulative gold MEDALS won in each country and ranking them. The x-axis is about first 5 ranking, the y-axis is about the Accumulated Number of gold medal, and the different color means different countries, so there are 2 interesting finding in this GIF,

How about the Medals of top 5 countries allocated in the events (Ziang_Li)

top1 <- country_T %>%
  top_n(5)

aa <- sport_count %>% 
  arrange(desc(n)) %>% 
  top_n(5)

ttt <- top1 %>% 
  inner_join(data, top1, by="NOC") %>% 
  filter (!is.na (Medal)) %>% 
  filter ( Sport == "Athletics" | Sport == "Gymnastics" | Sport == "Swimming" | Sport == "Shooting" | Sport == "Cycling") %>% 
  select (Country, Sport, Event, Medal) %>% 
  group_by(Country, Sport, Event, Medal) %>% 
  tally() %>% 
  ungroup() %>% 
  group_by(Country, Sport, Event) %>% 
  summarise( n = sum(n)) 

Athletics <- ttt %>% 
  ungroup() %>% 
  filter(Sport == "Athletics") %>% 
  group_by(Event) %>% 
  summarise( tot = sum(n)) %>% 
  arrange(desc(tot)) %>% 
  top_n(5)

Cycling <- ttt %>% 
  ungroup() %>% 
  filter(Sport == "Cycling") %>% 
  group_by(Event) %>% 
  summarise( tot = sum(n)) %>% 
  arrange(desc(tot)) %>% 
  top_n(5)

Gymnastics <- ttt %>% 
  ungroup() %>% 
  filter(Sport == "Gymnastics") %>% 
  group_by(Event) %>% 
  summarise( tot = sum(n)) %>% 
  arrange(desc(tot)) %>% 
  top_n(5)

Shooting <- ttt %>% 
  ungroup() %>% 
  filter(Sport == "Shooting") %>% 
  group_by(Event) %>% 
  summarise( tot = sum(n)) %>% 
  arrange(desc(tot)) %>% 
  top_n(5)

Swimming <- ttt %>% 
  ungroup() %>% 
  filter(Sport == "Swimming") %>% 
  group_by(Event) %>% 
  summarise( tot = sum(n)) %>% 
  arrange(desc(tot)) %>% 
  top_n(5)

rrr <- rbind(Athletics, Cycling, Gymnastics, Shooting, Swimming) 

ooo <- left_join(rrr, ttt, by="Event") 
ggplot(as.data.frame(ooo),
       aes(y = n, axis1 = Country, axis2 = Sport, axis3 = Event)) +
  geom_alluvium(aes(fill = Country), width = 1/40) +
  geom_stratum(width = 1/50, fill = "white", color = "black") +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)))+
  scale_x_discrete(limits = c("Country", "Sport", "Event"), expand = c(.05, .05)) +
  scale_fill_brewer(type = "qual", palette = "Set3") +
  ggtitle("Distribution of the top 5 Countries' medals among the 5 sports with the most participants")

qqq <- data %>% 
  filter(!is.na(Medal)) %>%
  select (NOC, Year, Medal)%>% 
  cbind (1) %>% 
  rename( "Number" = "1") %>% 
  select(NOC, Year, Number) 

qqq <- left_join(qqq, country_T) 

www <- qqq %>% 
  select(Country, Year, Number) %>% 
  group_by(Country, Year) %>% 
  summarise( number = sum(Number)) %>% 
  mutate(total = cumsum(number)) %>% 
  filter(Country == "United States" | Country == "Germany" | Country == "United Kingdom" | Country == "France" | Country == "Italy")
q <- ggplot(www, aes(x=Year, y=total, group=Country, color=Country)) +
    geom_line() +
    geom_point() +
    scale_color_viridis(discrete = TRUE) +
    ggtitle("Changes in the total number of medals of the top 5 countries in the previous Olympic Games") +
    theme_ipsum() +
    ylab("Number of the medals") +
    transition_reveal(Year)

animate(q, nframes = 350,fps = 25,  width = 1200, height = 1000, 
        renderer = gifski_renderer("yearly.gif"))

First question: Compare the total medal won by different age for both female and male.

data<- read.csv("Data/athlete_events.csv")
clean_data <- data %>% 
  select (c("Sex","Age","Medal")) %>% 
  mutate(Sex=as.factor(Sex),
        Medal= as.factor(Medal)) %>% 
  group_by(Sex,Age) %>% 
  count(Medal) 

clean_data
## # A tibble: 425 x 4
## # Groups:   Sex, Age [138]
##    Sex     Age Medal      n
##    <fct> <int> <fct>  <int>
##  1 F        11 Silver     1
##  2 F        11 <NA>      11
##  3 F        12 Bronze     1
##  4 F        12 Silver     3
##  5 F        12 <NA>      28
##  6 F        13 Bronze     2
##  7 F        13 Gold       5
##  8 F        13 Silver     6
##  9 F        13 <NA>     138
## 10 F        14 Bronze    15
## # … with 415 more rows
clean_data1 <- clean_data %>% 
mutate(age_group = cut(Age,breaks = c(0,15,30,45,60,75,90))) %>% 
select(-Age) %>% 
  group_by(Sex,age_group) %>% 
  count(Medal) %>% 
  rename(number = "n")
## Adding missing grouping variables: `Age`
clean_data1
## # A tibble: 48 x 4
## # Groups:   Sex, age_group [13]
##    Sex   age_group Medal  number
##    <fct> <fct>     <fct>   <int>
##  1 F     (0,15]    Bronze      4
##  2 F     (0,15]    Gold        3
##  3 F     (0,15]    Silver      5
##  4 F     (0,15]    <NA>        5
##  5 F     (15,30]   Bronze     15
##  6 F     (15,30]   Gold       15
##  7 F     (15,30]   Silver     15
##  8 F     (15,30]   <NA>       15
##  9 F     (30,45]   Bronze     15
## 10 F     (30,45]   Gold       14
## # … with 38 more rows
table1<- clean_data1 %>% 
    group_by (Sex,age_group) %>% 
  summarise(total_medal_in_age_group= sum(number))
  
table1
## # A tibble: 13 x 3
## # Groups:   Sex [2]
##    Sex   age_group total_medal_in_age_group
##    <fct> <fct>                        <int>
##  1 F     (0,15]                          17
##  2 F     (15,30]                         60
##  3 F     (30,45]                         59
##  4 F     (45,60]                         32
##  5 F     (60,75]                         15
##  6 F     <NA>                             4
##  7 M     (0,15]                          15
##  8 M     (15,30]                         60
##  9 M     (30,45]                         60
## 10 M     (45,60]                         60
## 11 M     (60,75]                         31
## 12 M     (75,90]                          6
## 13 M     <NA>                             6
compare the percentage of thetotal medal won by different age groups for both female and male
Sex total_medal_by_sex age_group total_medal_in_age_group Percentage
F 187 (0,15] 17 9.090909
F 187 (15,30] 60 32.085561
F 187 (30,45] 59 31.550802
F 187 (45,60] 32 17.112299
F 187 (60,75] 15 8.021390
F 187 NA 4 2.139037
M 238 (0,15] 15 6.302521
M 238 (15,30] 60 25.210084
M 238 (30,45] 60 25.210084
M 238 (45,60] 60 25.210084
M 238 (60,75] 31 13.025210
M 238 (75,90] 6 2.521008
M 238 NA 6 2.521008

In Table @ref(tab:table1) we compare the total medal won by different age groups for both female and male.

From this table, it shows that female younger athletes age range from 15- 30 has the highest percentage (32.09%) of total medal won compared to other age group, while older female athletes age from 60 to 75 has the least percentage of medal won (8.02%). However, for males athletes, it appears that age group between (15-30),(30-45),(45-60_ share the same percentage( and highest) of total medal won, while older male athletes age form 75 to 90 again has the least percentage of medal won ( 2.52%).

The percentage of total medal won in different age group by sex

The percentage of total medal won in different age group by sex

In Figure @ref(fig:figure1), we have plotted the percentage of the total medal won by different age group and compared in against both gender of male and female.

@ref(fig:figure1), depicts that for the age groups aged (0-15),(15-30),and (30-45), female athletes is accounted of a higher proportion of the total medal compared to males, However by the age group (45 to 60), males athletes exceed female athletes in the proportion of total medal won and continues to have a higher proportion than female in the older age bracket.

##Second question: Comparison of different medal distribution between age group by gender.

clean_data2 <- data %>% 
  select (c("Sex","Age","Medal")) %>% 
  mutate(Sex=as.factor(Sex),
        Medal= as.factor(Medal)) %>% 
  group_by(Sex,Age) %>% 
  count(Medal) %>% 
  rename("number"=n) 
  clean_data2 
## # A tibble: 425 x 4
## # Groups:   Sex, Age [138]
##    Sex     Age Medal  number
##    <fct> <int> <fct>   <int>
##  1 F        11 Silver      1
##  2 F        11 <NA>       11
##  3 F        12 Bronze      1
##  4 F        12 Silver      3
##  5 F        12 <NA>       28
##  6 F        13 Bronze      2
##  7 F        13 Gold        5
##  8 F        13 Silver      6
##  9 F        13 <NA>      138
## 10 F        14 Bronze     15
## # … with 415 more rows
gold1<- clean_data2 %>% 
  filter(Medal== "Gold") %>% 
  group_by(Sex) %>% 
  summarise(total_silver_medal=sum(number)) 
gold<- clean_data2 %>% 
  filter(Medal== "Gold") %>% 
  left_join(gold1 ,by = "Sex") %>% 
  mutate("Percentage" = ((number/total_silver_medal)*100)) 
silver1<- clean_data2 %>% 
  filter(Medal== "Silver") %>% 
  group_by(Sex) %>% 
  summarise(total_silver_medal=sum(number)) 
silver<- clean_data2 %>% 
  filter(Medal== "Silver") %>% 
  left_join(silver1 ,by = "Sex") %>% 
  mutate("Percentage" = ((number/total_silver_medal)*100)) 
bronze1<- clean_data2 %>% 
  filter(Medal== "Bronze") %>% 
  group_by(Sex) %>% 
  summarise(total_bronze_medal=sum(number)) 
bronze<- clean_data2 %>% 
  filter(Medal== "Bronze") %>% 
  left_join(bronze1 ,by = "Sex") %>% 
  mutate("Percentage" = ((number/total_bronze_medal)*100)) 
gold
## # A tibble: 89 x 6
## # Groups:   Sex, Age [89]
##    Sex     Age Medal number total_silver_medal Percentage
##    <fct> <int> <fct>  <int>              <int>      <dbl>
##  1 F        13 Gold       5               3747      0.133
##  2 F        14 Gold      20               3747      0.534
##  3 F        15 Gold      66               3747      1.76 
##  4 F        16 Gold     103               3747      2.75 
##  5 F        17 Gold     133               3747      3.55 
##  6 F        18 Gold     160               3747      4.27 
##  7 F        19 Gold     177               3747      4.72 
##  8 F        20 Gold     188               3747      5.02 
##  9 F        21 Gold     265               3747      7.07 
## 10 F        22 Gold     310               3747      8.27 
## # … with 79 more rows
gold_plot<- ggplot(data=gold, aes(  x= Age,
                                      y= Percentage,
                                      fill= Sex)) +
  geom_density (alpha = 0.4, stat = "identity", position = "identity")+

  ggtitle("The distribution of gold medal won by different age group for female and male")
silver 
## # A tibble: 100 x 6
## # Groups:   Sex, Age [100]
##    Sex     Age Medal  number total_silver_medal Percentage
##    <fct> <int> <fct>   <int>              <int>      <dbl>
##  1 F        11 Silver      1               3735     0.0268
##  2 F        12 Silver      3               3735     0.0803
##  3 F        13 Silver      6               3735     0.161 
##  4 F        14 Silver     25               3735     0.669 
##  5 F        15 Silver     59               3735     1.58  
##  6 F        16 Silver    101               3735     2.70  
##  7 F        17 Silver    100               3735     2.68  
##  8 F        18 Silver    158               3735     4.23  
##  9 F        19 Silver    179               3735     4.79  
## 10 F        20 Silver    205               3735     5.49  
## # … with 90 more rows
silver_plot<- ggplot(data=silver, aes(  Age,
                                      y= Percentage,
                                      fill= Sex)) +
   geom_density (alpha = 0.4, stat = "identity", position = "identity")+
  ggtitle("The distribution of silver medal won by different age group for female and male ")
bronze
## # A tibble: 99 x 6
## # Groups:   Sex, Age [99]
##    Sex     Age Medal  number total_bronze_medal Percentage
##    <fct> <int> <fct>   <int>              <int>      <dbl>
##  1 F        12 Bronze      1               3771     0.0265
##  2 F        13 Bronze      2               3771     0.0530
##  3 F        14 Bronze     15               3771     0.398 
##  4 F        15 Bronze     51               3771     1.35  
##  5 F        16 Bronze     86               3771     2.28  
##  6 F        17 Bronze    110               3771     2.92  
##  7 F        18 Bronze    139               3771     3.69  
##  8 F        19 Bronze    167               3771     4.43  
##  9 F        20 Bronze    216               3771     5.73  
## 10 F        21 Bronze    265               3771     7.03  
## # … with 89 more rows
bronze_plot<- ggplot(data=bronze, aes(x=Age,
                                      y= Percentage,
                                      fill= Sex)) +
   geom_density (alpha = 0.4, stat = "identity", position = "identity")+
  ggtitle("The distribution of bronze medal won by different age group for female and male ")
grid.arrange(gold_plot,silver_plot,bronze_plot, ncol=2)
The medal distribution won in different age group by sex

The medal distribution won in different age group by sex

In Figure @ref(fig:figure2), the different medal distribution was plotted for different age and compared in against both gender of male and female.

From this plot. the gold medal distribution for female and male are positively skewed with the athletes in the younger age group accounting for more of the gold medals earned than the older athletes (specifically, both female and male athletes age in early 20s have the highest percentage of the gold medal won).Similar result could be seen for silver medal distribution, both distribution for female and male athlete are positively skewed, however we can see that the age group that account the most percentage of silver medal won is ranged from 20-30 years older for both female and male athletes. In the bronze medal distribution, it shares similar distribution as silver medal as athletes age from 20 to 30 is account for the most percentage of bronze medal won for both female and male. Additionally, it is seen that for all medal distribution, female athletes of younger age (0-20) tend to account for higher percentage of medal (gold, silver, and bronze) than male athletes, however by age 30 and over, male athletes exceed female athletes in medal won for gold, silver and bronze medal. This could be due the quicker fall of physical, technical and strategics abilities of females athletes 30 and over , companied with increasing social pressure that female of an older age to be more family orientated.

Hence from the above analysis, we conclude that in general for both gender, age 20 to 30 tends to account for most proportions of the medals (gold, silver, and bronze)won, furthermore, it was found that female athletes tends to do better than male for medal wons before age20, however later was exceed by male after age 30.

Conclusion

In summary, from the presentation today, we conclude that for quesiton1 when comparing the distribution of medal in different country aound the world, America appears to have the most medals in the world. In addtional, for question two, we have found that the sport athletics have the largest number of athletics, and in this sports, The United States accumulated the most gold MEDALS in each year. Furthermore for question three, we conclude that The United States has an absolute advantage in swimming events,The medals of United Kingdom are more distributed in Cycling. and that the Medals from other countries are very evenly distributed among the five events lastly. when comparing the medals won for different age group by gender, it was revealed that in general for both gender, age bracket from 20-40 and 40 to 60 in general has the most proporiton of medal won compared to other age group for gold, silver and bronze medal won.